#!pip install pybaseball
from pybaseball import statcast, statcast_batter, playerid_lookup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from scipy.stats import multivariate_normal
from scipy.spatial.distance import mahalanobis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.mixture import GaussianMixture
from sklearn.metrics import pairwise_distances_argmin_min
import warnings
warnings.filterwarnings('ignore')
# Original Statcast pulls, kept for reference; the CSV snapshots below are read instead.
# data2021 = statcast('2021-04-01', '2021-10-03', parallel = True)
# data2022 = statcast('2022-03-31', '2022-10-02', parallel = True)
# data2023 = statcast('2023-03-30', '2023-10-01', parallel = True)
data2021 = pd.read_csv('statcast21.csv')
data2022 = pd.read_csv('statcast22.csv')

# Number of 2021 pitches per pitcher, most frequent first; the index holds the ordered names
pitch_totals = data2021['player_name'].value_counts(sort=True)
sorted_names = pitch_totals.index
# Manual inspection of low-volume pitchers:
# for i, j in zip(sorted_names, pitch_totals):
#     if j <= 150:
#         print(i, j)
# Position players to leave out of the dataset.
# Each entry is written "Last, First", so splitting the combined string on ','
# yields alternating last-name / first-name tokens.
names = (
    "Blandino, Alex,León, Sandy,Sogard, Eric,Astudillo, Willians,Peterson, Jace,Torreyes, Ronald,Mercedes, Yermín,"
    "Castro, Harold,Peralta, David,Almora Jr., Albert,Mejía, Francisco,Drury, Brandon,Tauchman, Mike,Burns, Andy,"
    "Phillips, Brett,Eaton, Adam,Bemboom, Anthony,Ruf, Darin,Maile, Luke,Tom, Ka'ai,Luplow, Jordan,Plawecki, Kevin,"
    "Rogers, Jake,Reddick, Josh,Moreland, Mitch,Cabrera, Asdrúbal,Guillorme, Luis,Holaday, Bryan,Pérez, Hernán,"
    "Wilkerson, Stevie,Arroyo, Christian,Rizzo, Anthony,Romine, Andrew,Mendick, Danny,Schrock, Max,"
    "Culberson, Charlie,Araúz, Jonathan,Gonzalez, Marwin,Turner, Justin,Robertson, Daniel,Holt, Brock,"
    "Mathisen, Wyatt,Wynns, Austin,Lopez, Nicky,Moran, Brian,Owings, Chris,Reyes, Pablo,Difo, Wilmer,"
    "Fuentes, Joshua,Valaika, Pat,Cronenworth, Jake,Pillar, Kevin,Slater, Austin,Evans, Phillip,Duffy, Matt,"
    "Alcántara, Sergio,Knapp, Andrew,Freeman, Mike,Gonzalez, Romy,Maton, Nick,Mayfield, Jack,Alberto, Hanser,"
    "Clemens, Kody,González, Luis,Gordon, Nick,VanMeter, Josh,Bethancourt, Christian,Brosseau, Mike,Myers, Wil,"
    "Clement, Ernie,Schwindel, Frank,Molina, Yadier,Taylor, Michael A.,Neuse, Sheldon,Strange-Gordon, Dee,"
    "McKenna, Ryan,Bradley Jr., Jackie,Pujols, Albert,Freeman, Caleb,Pinder, Chad,Chang, Yu,Batten, Matthew,"
    "Simmons, Andrelton,Escobar, Alcides,Dozier, Hunter,Walton, Donovan,Reynolds, Matt,Torrens, Luis,Dickerson, Corey,"
    "Barnhart, Tucker,Caratini, Victor,Merrifield, Whit,Adrianza, Ehire,Knizner, Andrew,Grichuk, Randal,Serven, Brian,"
    "Palacios, Jermaine,McGuire, Reese,Vargas, Ildemaro,Hall, Darick,Gosselin, Phil,Nogowski, John,Stubbs, Garrett,"
    "Sánchez, Yolmer,Arcia, Orlando,Ford, Mike,Hager, Jake,Harrison, Josh,Kelly, Carson,Lopez, Alejo,"
    "Carpenter, Matt,Reyes, Franmil,García, Robel"
).split(',')
# Re-pair the tokens into full names as they appear in the Statcast name column
pos_players = [f"{last},{first}" for last, first in zip(names[0::2], names[1::2])]
# Per-pitcher pitch counts and velocity summaries, used to spot position players
# (few pitches thrown, low or erratic velocity).
# String aggregation names are used instead of NumPy callables: pandas deprecates
# passing np.max / np.mean / np.median to GroupBy.agg, and 'count' states the
# intent (number of pitches) directly instead of np.count_nonzero on a name column.
data21_grps = data2021.groupby('player_name').agg(
    max_velo=('release_speed', 'max'),
    med_velo=('release_speed', 'median'),
    avg_velo=('release_speed', 'mean'),
    num_pitches=('player_name', 'count'),
)
data22_grps = data2022.groupby('player_name').agg(
    max_velo=('release_speed', 'max'),
    med_velo=('release_speed', 'median'),
    avg_velo=('release_speed', 'mean'),
    num_pitches=('player_name', 'count'),
)
# Gap between a pitcher's hardest pitch and his average pitch
data21_grps['Max-Avg Diff'] = data21_grps['max_velo'] - data21_grps['avg_velo']
data22_grps['Max-Avg Diff'] = data22_grps['max_velo'] - data22_grps['avg_velo']
# Filter out to find all position players
#data21_grps[((data21_grps.num_pitches <= 200) & (data21_grps['med_velo'] < 85)) |
# ((data21_grps.num_pitches <= 200) & (data21_grps['Max-Avg Diff'] > 12))|
# ((data21_grps.num_pitches <= 200) & (data21_grps['max_velo'] < 90))]
#data22_grps[((data22_grps.num_pitches <= 200) & (data22_grps['med_velo'] < 85)) |
# ((data22_grps.num_pitches <= 200) & (data22_grps['Max-Avg Diff'] > 12))|
# ((data22_grps.num_pitches <= 200) & (data22_grps['max_velo'] < 90))]
#(Recursively) Eliminate position players from dataset, update dataset
#data21 = data2021[~data2021.player_name.isin(pos_players)]
#data22 = data2022[~data2022.player_name.isin(pos_players)]
#data21.to_csv('statcast21.csv',index=False)
#data22.to_csv('statcast22.csv',index=False)
# Define a function to fill NaN values within groups
def fillna_by_pitcher(df, cols):
    '''
    Description: Fills NA values (pitch metrics) with the column mean of the
                 dataframe passed in. Intended to be applied per group
                 (pitch type per pitcher) so each group is filled with its own mean.
    --------------------------------------------------------------------------------
    Inputs: df (dataframe), cols (list of column names to fill)
    Returns: df
             The same dataframe object, with NA values in cols filled
    '''
    for col in cols:
        col_mean = df[col].mean()
        # Assign the filled column back. `df[col].fillna(..., inplace=True)` operates
        # on a temporary Series and is not guaranteed to update df (it is a no-op
        # under pandas Copy-on-Write).
        df[col] = df[col].fillna(col_mean)
    return df
def clean_train_data(df):
    '''
    Description: Cleans training data, filters dataframe for relevant features,
                 removes non-pitches and fills in NA values for each unique pitch for all pitchers
    --------------------------------------------------------------------------------
    Inputs: df
    Returns: df_clean
             Cleaned copy of the input df (the input itself is not modified),
             sorted chronologically by game_date, game_pk, at_bat_number, pitch_number
    '''
    # Define relevant feature columns, values to remove, columns with NA values to fill
    non_pitches = ['FA', 'PO']
    y = ['delta_run_exp']
    context_features = ['player_name', 'p_throws', 'batter', 'stand', 'pitch_type', 'pitch_number',
                        'home_team', 'game_date', 'game_pk', 'at_bat_number',
                        'balls', 'strikes', 'outs_when_up', 'on_3b', 'on_2b', 'on_1b']
    cont_features = ['release_speed', 'release_extension', 'effective_speed', 'release_spin_rate',
                     'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis', 'pfx_x', 'pfx_z',
                     'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot']
    features = y + context_features + cont_features
    group_keys = ['player_name', 'pitch_type']
    # Columns whose NA values are filled with the pitcher / pitch-type mean
    fill_cols = ['release_speed', 'release_extension', 'effective_speed', 'release_spin_rate',
                 'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis']
    # Rows still missing any of these after filling are dropped
    na_cols = ['delta_run_exp', 'pitch_type', 'pitch_number', 'pfx_x', 'pfx_z',
               'release_pos_x', 'release_pos_y', 'release_pos_z',
               'release_speed', 'release_extension', 'effective_speed', 'release_spin_rate',
               'spin_axis', 'sz_top', 'sz_bot']
    base_cols = ['on_1b', 'on_2b', 'on_3b']

    # Filter dataframe for features
    df = df[features]
    # Remove pitchouts/non-pitches, pitches with 0 movement (Statcast errors)
    is_pitch = ~df['pitch_type'].isin(non_pitches)
    has_movement = (df['pfx_x'] != 0.0) & (df['pfx_z'] != 0.0)
    # Rows without a pitcher or pitch type cannot be assigned to a group; drop them
    # up front (groupby would silently discard them) and work on an explicit copy.
    df_filt = df[is_pitch & has_movement].dropna(subset=group_keys).copy()

    # Fill NA values with the mean of the same pitch type thrown by the same pitcher.
    # transform('mean') returns one row per input row in the original order, so the
    # result does not depend on how groupby.apply treats group keys in a given pandas
    # version. The fill is positional, so a non-unique index is fine.
    group_means = df_filt.groupby(group_keys)[fill_cols].transform('mean')
    current = df_filt[fill_cols]
    df_filt[fill_cols] = np.where(current.isna(), group_means, current)

    df_clean = df_filt.dropna(subset=na_cols).copy()
    # Encode base occupancy as 0/1 (the raw columns are non-null when the base is occupied)
    df_clean[base_cols] = df_clean[base_cols].notna().astype(int)
    # Sort dataframe by pitches in chronological order, return
    df_clean = df_clean.sort_values(['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])
    return df_clean
def add_new_features(df, season_start):
    '''
    Description: Adds new features in dataframe
                 - inferred_axis: Inferred Spin Axis (SSW Effects)
                 - axis_diff: Difference of Inferred and Observed Spin Axis
                 - game_week: Change game date to week of season depending on start date of season
                 - pitch_count: Pitch # of outing for each outing per pitcher
    --------------------------------------------------------------------------------
    Inputs: df, season_start (str, formatted 'YYYY-MM-DD')
    Returns: df
             Copy of the dataframe with new features added (the input is left untouched)
    '''
    # Work on a copy so the caller's frame is not mutated and no temporary
    # helper column leaks into it.
    df = df.copy()
    # inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90 (where pfx_x is < 0, add 180 degrees.)
    df['inferred_axis'] = np.degrees(np.arctan(df['pfx_z'] / df['pfx_x'])) + 90
    df.loc[df['pfx_x'] < 0, 'inferred_axis'] += 180
    # axis_diff: spin_axis - inferred_axis
    # NOTE(review): this is a raw difference of two angles and is not wrapped
    # to [-180, 180] -- confirm that is intended.
    df['axis_diff'] = df['spin_axis'] - df['inferred_axis']
    # pitch_count: cumulative pitch number of the outing for each pitcher;
    # the sorted result is aligned back to df by index on assignment.
    chronological = ['game_date', 'game_pk', 'at_bat_number', 'pitch_number']
    outing_keys = ['game_date', 'game_pk', 'player_name']
    df['pitch_count'] = df.sort_values(chronological).groupby(outing_keys).cumcount() + 1
    # game_week: 1-based week of the season, counted in 7-day blocks from season_start.
    # Computed column-wise rather than with a row-wise apply.
    start_date = pd.Timestamp(datetime.strptime(season_start, '%Y-%m-%d'))
    game_days = pd.to_datetime(df['game_date']).dt.normalize()
    df['game_week'] = (game_days - start_date).dt.days // 7 + 1
    return df
# Clean each season, then add the engineered features; the second argument is the
# season start date that anchors the game_week feature.
data21 = add_new_features(clean_train_data(data2021), '2021-04-01')
data22 = add_new_features(clean_train_data(data2022), '2022-03-31')
def plot_lineplot(dataset, xvar, yvar, hue_var):
    '''
    Description: Draws a seaborn lineplot on a new 20x10 figure and writes the
                 hue value next to the last point of each drawn line
    --------------------------------------------------------------------------------
    Inputs: dataset, xvar, yvar, hue_var
    Returns: None
    '''
    plt.figure(figsize = (20,10))
    axes = sns.lineplot(data = dataset, x = xvar, y = yvar, hue = hue_var, palette = 'tab10')
    for drawn_line in axes.lines:
        xs, ys = drawn_line.get_data()
        # Lines that carry no points get no label
        if len(xs) == 0 and len(ys) == 0:
            continue
        last_x, last_y = xs[-1], ys[-1]
        # NOTE(review): the label is recovered by matching the line's final y value
        # against the dataset, so two hue groups ending on the same y value would
        # both receive the label of the last matching row -- confirm this is acceptable.
        matching_rows = dataset[dataset[yvar] == last_y]
        label = matching_rows[hue_var].values[-1]
        plt.text(last_x, last_y, label, ha='right', va='bottom', fontsize=12,
                 color=drawn_line.get_color())
def get_aggregate_count(data, player, year, plot = True):
    '''
    Description: Counts pitches per pitch_type per game_date (0 where a pitch type
                 was not thrown on a date), adds the running total per pitch_type,
                 and optionally plots the running totals
    --------------------------------------------------------------------------------
    Inputs: data (dataframe), player (str), year (str/int), plot (boolean, default True)
            player and year are only used in the plot title
    Returns: pitch_aggs (dataframe)
             Columns: pitch_type, game_date, count, Cumulative_Count
    '''
    # Parse dates on a new frame so the caller's data is untouched. The counts below
    # come from a sorted groupby, so the input row order does not affect the result.
    dated = data.assign(game_date=pd.to_datetime(data['game_date']))
    # Wide table: one row per pitch_type, one column per game_date
    daily_counts = dated.groupby(['pitch_type','game_date']).size().unstack(fill_value=0)
    # Back to long format: one row per (game_date, pitch_type), dates ascending
    pitch_aggs = daily_counts.reset_index().melt(
        id_vars='pitch_type', var_name='game_date', value_name='count')
    # Running total of each pitch type across the dates
    pitch_aggs['Cumulative_Count'] = pitch_aggs.groupby('pitch_type')['count'].cumsum()
    if plot:
        plot_lineplot(pitch_aggs, 'game_date', 'Cumulative_Count', 'pitch_type')
        # Axis formatting, labels, title and legend
        plt.xticks(rotation=45)
        plt.xlabel('Date')
        plt.ylabel('Aggregate Count')
        plt.title(f'Aggregate Counts by Pitch Type by Game Date, {player}, {year}', fontsize=16)
        plt.legend(title='Pitch Type',loc="upper left")
        plt.show()
    return pitch_aggs
def get_cumulative_freq(agg_data, player, year, plot = True):
    '''
    Description: Converts cumulative pitch counts into cumulative relative
                 frequencies (share of all pitches counted up to each date),
                 and optionally plots them
    --------------------------------------------------------------------------------
    Inputs: agg_data (dataframe with game_date, count, Cumulative_Count columns),
            player (str), year (str/int), plot (boolean, default True)
            player and year are only used in the plot title
    Returns: merged_freq_df (dataframe)
             agg_data plus season_total and Cumulative_Rel_Freq columns
    '''
    # Running total of all pitches up to and including each game_date
    per_date_total = agg_data.groupby('game_date')['count'].sum()
    pitch_total_sum = per_date_total.cumsum().rename('season_total').reset_index()
    # Attach the running total to every row, then divide to get the cumulative share
    merged_freq_df = agg_data.merge(pitch_total_sum, how='left', on = 'game_date')
    cumulative_share = merged_freq_df['Cumulative_Count'] / merged_freq_df['season_total']
    merged_freq_df['Cumulative_Rel_Freq'] = cumulative_share
    if plot:
        plot_lineplot(merged_freq_df, 'game_date', 'Cumulative_Rel_Freq','pitch_type')
        # Axis formatting, labels, title and legend
        plt.xticks(rotation=45)
        plt.xlabel('Date')
        plt.ylabel('Cumulative Relative Frequency')
        plt.title(f'Cumulative Relative Frequency of Pitch Type, {player}, {year}', fontsize=16)
        plt.legend(title='Pitch Type',loc="upper left")
        plt.show()
    return merged_freq_df
# League-wide cumulative pitch counts and cumulative usage shares, 2021 and 2022
all21_aggregates = get_aggregate_count(data21, 'All', 2021)
all21_rel_freq = get_cumulative_freq(all21_aggregates, 'All', 2021)
all22_aggregates = get_aggregate_count(data22, 'All', 2022)
all22_rel_freq = get_cumulative_freq(all22_aggregates, 'All', 2022)

# 2021 split by batter handedness; only the cumulative-frequency chart is drawn
data21_rhb = data21[data21['stand'] == 'R']
data21_rhb_agg = get_aggregate_count(data21_rhb, 'All RHB', 2021, plot=False)
data21_rhb_cumul = get_cumulative_freq(data21_rhb_agg, 'All RHB', 2021)
data21_lhb = data21[data21['stand'] == 'L']
data21_lhb_agg = get_aggregate_count(data21_lhb, 'All LHB', 2021, plot=False)
data21_lhb_cumul = get_cumulative_freq(data21_lhb_agg, 'All LHB', 2021)
def get_relative_freq(data, player, year, plot = True):
    '''
    Description: Computes, for each game_date, the share of that date's pitches
                 belonging to each pitch_type, and optionally plots the shares
    --------------------------------------------------------------------------------
    Inputs: data (dataframe), player (str), year (str/int), plot (boolean, default True)
            player and year are only used in the plot title
    Returns: merged_df (dataframe)
             Columns: pitch_type, game_date, count, total, Relative_Freq
    '''
    # Parse dates on a new frame so the caller's data is untouched. The counts below
    # come from sorted groupbys, so the input row order does not affect the result.
    dated = data.assign(game_date=pd.to_datetime(data['game_date']))
    # Pitches per pitch_type per date (0 where not thrown), reshaped to long format
    daily_counts = dated.groupby(['pitch_type','game_date']).size().unstack(fill_value=0)
    long_counts = daily_counts.reset_index().melt(
        id_vars='pitch_type', var_name='game_date', value_name='count')
    long_counts = long_counts.sort_values(by=['pitch_type', 'game_date']).reset_index(drop=True)
    # Total pitches thrown on each game_date
    daily_totals = dated.groupby('game_date')['pitch_type'].count().reset_index(name='total')
    # Share of the day's pitches for each pitch type
    merged_df = long_counts.merge(daily_totals, how='left', on='game_date')
    merged_df['Relative_Freq'] = merged_df['count'] / merged_df['total']
    if plot:
        plot_lineplot(merged_df, 'game_date','Relative_Freq','pitch_type')
        # Axis formatting, labels, title and legend
        plt.xticks(rotation=45)
        plt.xlabel('Date')
        plt.ylabel('Relative Frequency')
        plt.title(f'Relative Pitch Frequencies by Pitch Type per Game Date, {player}, {year}', fontsize=16)
        plt.legend(title='Pitch Type',loc="upper left")
        plt.show()
    return merged_df
# League-wide per-date relative pitch frequencies
all21_relative = get_relative_freq(data21, 'All','2021')
all22_relative = get_relative_freq(data22, 'All','2022')
# Reversed, re-indexed copy of the 2021 data used for the single-pitcher examples
test_data = data21.copy()[::-1].reset_index(drop=True)
# Shohei Ohtani, 2021: cumulative counts, cumulative shares, per-date shares
ohtani = test_data[test_data.player_name == 'Ohtani, Shohei']
ohtani_agg21 = get_aggregate_count(ohtani, 'Shohei Ohtani','2021')
ohtani21_cumul_freq = get_cumulative_freq(ohtani_agg21, 'Shohei Ohtani', 2021)
ohtani_rel21 = get_relative_freq(ohtani, 'Shohei Ohtani','2021')
# Zack Wheeler, 2021 (the cumulative-count chart is skipped)
wheeler = test_data[test_data.player_name == 'Wheeler, Zack']
wheeler_agg21 = get_aggregate_count(wheeler, 'Zack Wheeler','2021', False)
wheeler_cumul_freq21 = get_cumulative_freq(wheeler_agg21,'Zack Wheeler', 2021)
# Year passed as '2021' (was ' 2021'): the stray leading space showed up in the plot title
wheeler_rel21 = get_relative_freq(wheeler, 'Zack Wheeler','2021')
# Zack Wheeler split by batter handedness
wheeler_lhb21 = wheeler[wheeler.stand == 'L']
wheeler_rhb21 = wheeler[wheeler.stand == 'R']
wheeler_rhb_agg21 = get_aggregate_count(wheeler_rhb21, 'Zack Wheeler (RHB)','2021', False)
wheeler_rhb21_cumul = get_cumulative_freq(wheeler_rhb_agg21,'Zack Wheeler (RHB)', 2021)
wheeler_lhb_agg21 = get_aggregate_count(wheeler_lhb21, 'Zack Wheeler (LHB)','2021', False)
wheeler_lhb21_cumul = get_cumulative_freq(wheeler_lhb_agg21,'Zack Wheeler (LHB)', 2021)
# Pitches in chronological order; both running counts below are taken over this ordering
_ab_ordered = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
# Running pitch number within each at-bat
pitch_counts_ab = _ab_ordered.groupby(
    ['game_date','game_pk','player_name','at_bat_number']).cumcount() + 1
# Running count of the same pitch type within each at-bat
pitch_type_counts_ab = _ab_ordered.groupby(
    ['game_date','game_pk','player_name','at_bat_number','pitch_type']).cumcount() + 1
# Histogram of each pitch's cumulative share of its type within the at-bat
# (the two Series are aligned by index when divided)
sns.histplot(pitch_type_counts_ab/pitch_counts_ab)
plt.title('Cumulative Relative Frequency of Pitches per AB per Game')
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per AB per Game')
# Pitches in chronological order; both running counts below are taken over this ordering
_matchup_ordered = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
# Running pitch number within each batter v. pitcher matchup per game
pitch_counts_hitvpitch = _matchup_ordered.groupby(
    ['game_date','game_pk','player_name','batter']).cumcount() + 1
# Running count of the same pitch type within each batter v. pitcher matchup per game
pitch_type_counts_hitvpitch = _matchup_ordered.groupby(
    ['game_date','game_pk','player_name','batter','pitch_type']).cumcount() + 1
# Histogram of each pitch's cumulative share of its type within the matchup
# (the two Series are aligned by index when divided)
sns.histplot(pitch_type_counts_hitvpitch/pitch_counts_hitvpitch)
plt.title('Cumulative Relative Frequency of Pitches per Hitter v. Pitcher Matchup per Game')
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per Hitter v. Pitcher Matchup per Game')
# Pitches in chronological order; both running counts below are taken over this ordering
_outing_ordered = data21.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
# Running pitch number within each pitcher outing (start or relief appearance)
pitch_counts = _outing_ordered.groupby(
    ['game_date','game_pk','player_name']).cumcount() + 1
# Running count of the same pitch type within each pitcher outing
pitch_type_counts = _outing_ordered.groupby(
    ['game_date','game_pk','player_name','pitch_type']).cumcount() + 1
# Histogram of each pitch's cumulative share of its type within the outing
# (the two Series are aligned by index when divided)
sns.histplot(pitch_type_counts/pitch_counts)
plt.title('Cumulative Relative Frequency of Pitches per Outing')
Text(0.5, 1.0, 'Cumulative Relative Frequency of Pitches per Outing')
def _primary_pitch(pitches):
    '''
    Description: Returns the most-thrown fastball type (FC, SI or FF) in a group of
                 pitch types, or the most-thrown pitch overall if no fastball was thrown
    --------------------------------------------------------------------------------
    Inputs: pitches (Series of pitch_type values)
    Returns: pitch type label
    '''
    fastballs = pitches[pitches.isin(['FC','SI','FF'])]
    candidates = fastballs if len(fastballs) > 0 else pitches
    return candidates.value_counts().idxmax()
# Primary fastball (or primary pitch when no fastball was thrown) per pitcher outing
primary_fb = data21.groupby(['game_date','game_pk','player_name']).agg(
    {'pitch_type': _primary_pitch})
primary_fb = primary_fb.rename(columns={'pitch_type':'primary_pitch'})
# Keep only the pitches whose type equals the outing's primary pitch; the inner merge
# matches pitch_type against primary_pitch and adds the primary_pitch column
_outing_keys = ['game_date','game_pk','player_name']
primary_fb_data = data21.merge(primary_fb.reset_index(), how='inner',
                               left_on=_outing_keys + ['pitch_type'],
                               right_on=_outing_keys + ['primary_pitch'])
primary_fb_data
| delta_run_exp | player_name | p_throws | batter | stand | pitch_type | pitch_number | home_team | game_date | game_pk | ... | ax | ay | az | sz_top | sz_bot | inferred_axis | axis_diff | pitch_count | game_week | primary_pitch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038 | Márquez, Germán | R | 605141 | R | FF | 1 | COL | 2021-04-01 | 634615 | ... | -4.967683 | 25.527820 | -18.815090 | 3.46 | 1.76 | 196.542983 | 21.623025 | 1 | 1 | FF |
| 1 | -0.049 | Márquez, Germán | R | 605141 | R | FF | 2 | COL | 2021-04-01 | 634615 | ... | -4.405496 | 25.441128 | -19.766641 | 3.29 | 1.49 | 195.859366 | -0.859366 | 2 | 1 | FF |
| 2 | 0.052 | Márquez, Germán | R | 605141 | R | FF | 3 | COL | 2021-04-01 | 634615 | ... | -6.331036 | 25.767684 | -20.177597 | 3.34 | 1.64 | 206.029592 | 1.970408 | 3 | 1 | FF |
| 3 | 0.113 | Márquez, Germán | R | 605141 | R | FF | 4 | COL | 2021-04-01 | 634615 | ... | -1.810341 | 24.856947 | -23.067890 | 3.29 | 1.58 | 185.042451 | 33.123557 | 4 | 1 | FF |
| 4 | -0.078 | Márquez, Germán | R | 605141 | R | FF | 5 | COL | 2021-04-01 | 634615 | ... | -3.312501 | 24.833759 | -20.112617 | 3.22 | 1.55 | 191.181754 | 20.818246 | 5 | 1 | FF |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 332042 | -0.061 | Gsellman, Robert | R | 594807 | R | SI | 3 | ATL | 2021-10-03 | 632254 | ... | -18.151571 | 31.755423 | -19.628348 | 3.41 | 1.56 | 236.901606 | -15.901606 | 13 | 27 | SI |
| 332043 | -0.038 | Smith, Will | L | 641645 | L | FF | 1 | ATL | 2021-10-03 | 632254 | ... | 10.379363 | 29.462506 | -13.261773 | 3.31 | 1.55 | 153.279254 | -7.279254 | 1 | 27 | FF |
| 332044 | 0.026 | Smith, Will | L | 607043 | L | FF | 1 | ATL | 2021-10-03 | 632254 | ... | 8.211265 | 31.770872 | -13.621914 | 3.65 | 1.60 | 159.573975 | -14.573975 | 4 | 27 | FF |
| 332045 | -0.189 | Smith, Will | L | 607043 | L | FF | 2 | ATL | 2021-10-03 | 632254 | ... | 8.700586 | 30.117690 | -15.941174 | 3.49 | 1.60 | 153.794165 | -10.794165 | 5 | 27 | FF |
| 332046 | -0.073 | Smith, Will | L | 596019 | R | FF | 4 | ATL | 2021-10-03 | 632254 | ... | 9.708393 | 26.562803 | -14.083224 | 3.32 | 1.51 | 153.434949 | -5.434949 | 9 | 27 | FF |
332047 rows × 40 columns
# Velocity and movement variables for which a differential from the primary pitch is computed
velo_mvt_cols = ['release_speed','release_spin_rate','pfx_x', 'pfx_z',
                 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis', 'inferred_axis','axis_diff']
# Per-outing mean of each variable over the pitcher's primary pitch, stored as <column>_mean
primary_fb_means = primary_fb_data.groupby(
    ['game_date','game_pk','player_name','pitch_type'])[velo_mvt_cols].mean()
primary_fb_means = primary_fb_means.add_suffix('_mean').reset_index()
primary_fb_means = primary_fb_means.rename(columns={'pitch_type':'primary_pitch'})
# Attach the primary-pitch means of the outing to every pitch
data21_merged = data21.merge(primary_fb_means, how = 'inner', on = ['game_date','game_pk','player_name'])
# Differential of every pitch from the primary-pitch mean. Two columns use short
# names; all others are named <column>_diff.
_diff_names = {'release_speed': 'velo_diff', 'release_spin_rate': 'spin_rate_diff'}
for _col in velo_mvt_cols:
    _target = _diff_names.get(_col, _col + '_diff')
    data21_merged[_target] = data21_merged[_col] - data21_merged[_col + '_mean']
# Drop the helper columns: the primary pitch label and its means
_helper_cols = ['primary_pitch'] + [_col + '_mean' for _col in velo_mvt_cols]
data21_merged = data21_merged.drop(columns = _helper_cols)
data21_merged
| delta_run_exp | player_name | p_throws | batter | stand | pitch_type | pitch_number | home_team | game_date | game_pk | ... | pfx_z_diff | vx0_diff | vy0_diff | vz0_diff | ax_diff | ay_diff | az_diff | spin_axis_diff | inferred_axis_diff | axis_diff_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038 | Márquez, Germán | R | 605141 | R | FF | 1 | COL | 2021-04-01 | 634615 | ... | 0.1714 | 0.042215 | 0.108148 | 3.347021 | 1.119505 | 0.065464 | 1.588882 | 10.479368 | -7.466122 | 17.945490 |
| 1 | -0.049 | Márquez, Germán | R | 605141 | R | FF | 2 | COL | 2021-04-01 | 634615 | ... | 0.0414 | 0.507318 | -0.567653 | -0.268293 | 1.681692 | -0.021229 | 0.637331 | -12.686640 | -8.149739 | -4.536901 |
| 2 | 0.052 | Márquez, Germán | R | 605141 | R | FF | 3 | COL | 2021-04-01 | 634615 | ... | 0.0214 | -1.891575 | -1.366921 | 1.789345 | -0.243847 | 0.305327 | 0.226375 | 0.313360 | 2.020487 | -1.707127 |
| 3 | 0.113 | Márquez, Germán | R | 605141 | R | FF | 4 | COL | 2021-04-01 | 634615 | ... | -0.1586 | 0.030702 | -0.292753 | 3.321251 | 4.276848 | -0.605410 | -2.663917 | 10.479368 | -18.966654 | 29.446022 |
| 4 | -0.078 | Márquez, Germán | R | 605141 | R | FF | 5 | COL | 2021-04-01 | 634615 | ... | 0.0214 | -0.282896 | -1.062003 | 0.651255 | 2.774688 | -0.628598 | 0.291355 | 4.313360 | -12.827351 | 17.140711 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 704089 | -0.189 | Smith, Will | L | 607043 | L | FF | 2 | ATL | 2021-10-03 | 632254 | ... | -0.1150 | 0.861282 | 1.087932 | 0.157783 | -0.549316 | 0.639222 | -1.714153 | -2.500000 | -1.226421 | -1.273579 |
| 704090 | 0.016 | Smith, Will | L | 596019 | R | CU | 1 | ATL | 2021-10-03 | 632254 | ... | -2.0850 | 9.105589 | 24.382641 | 8.922196 | -15.152558 | -10.050905 | -24.057726 | 184.500000 | 159.979414 | 24.520586 |
| 704091 | -0.020 | Smith, Will | L | 596019 | R | CU | 2 | ATL | 2021-10-03 | 632254 | ... | -1.9050 | 4.752577 | 23.686689 | 5.488884 | -14.122826 | -8.876134 | -22.035163 | 182.500000 | 153.097690 | 29.402310 |
| 704092 | -0.027 | Smith, Will | L | 596019 | R | SL | 3 | ATL | 2021-10-03 | 632254 | ... | -0.9150 | 3.029527 | 16.157586 | 3.377736 | -15.781025 | -9.685077 | -13.142093 | 169.500000 | 83.040965 | 86.459035 |
| 704093 | -0.073 | Smith, Will | L | 596019 | R | FF | 4 | ATL | 2021-10-03 | 632254 | ... | -0.0150 | -0.103284 | -0.578015 | -0.717265 | 0.458491 | -2.915664 | 0.143797 | 2.500000 | -1.585637 | 4.085637 |
704094 rows × 52 columns
# For "noise" around pitch trajectory, calculate a multivariate normal distribution
# for each unique pitch thrown for each pitcher over a season
# Note: Calculating multivariate distributions for each pitch per game is
# both extremely computationally intensive and unstable, because each pitch type
# has only a small sample per game
# NOTE(review): an earlier comment here said axis_diff is excluded for this iteration,
# but the list below does include 'axis_diff' -- confirm which is intended.
# Define all continuous features
cont_feats = ['release_speed','release_extension','effective_speed',
'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
def multivariate_normal_distribution(x):
    '''
    Description: Applied to each group, fits a multivariate normal distribution to the
                 group's continuous features (cont_feats) using their mean vector
                 and covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (dataframe holding the rows of one group)
    Returns: mvn_dist, frozen SciPy multivariate normal distribution
    '''
    # Extract the continuous variables
    continuous_vars = x[cont_feats]
    mean = continuous_vars.mean()
    # Covariance is computed once. Undefined entries (e.g. a single-row group) are
    # replaced by 0, and a small ridge on the diagonal keeps the matrix invertible.
    cov_matrix = continuous_vars.cov().fillna(0)
    cov_matrix = cov_matrix + np.eye(cov_matrix.shape[0]) * 1e-6
    # Create a multivariate normal distribution object
    mvn_dist = multivariate_normal(mean=mean, cov=cov_matrix, allow_singular=True)
    return mvn_dist
# Fits one multivariate normal distribution per (pitcher, pitch type) over the 2021 season
# and stores the fitted distribution object in the MV_Dist column
# (original note: "> 3000 pitches per year" -- NOTE(review): presumably the number of groups; confirm)
pitch_noise_groups = data21.groupby(['player_name','pitch_type']).apply(
multivariate_normal_distribution).reset_index(name = 'MV_Dist')
def calc_mahalanobis(x):
    '''
    Description: Calculates mahalanobis distance of a pitch's continuous features
                 from the center of its distribution, using the inverse of the
                 distribution's covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (Series or sequence): the continuous feature values followed, in the
            last position, by the fitted multivariate normal distribution
    Returns: mahalanobis_distance (type float)
    '''
    # Select by position explicitly. Integer keys such as x[-1] on a label-indexed
    # Series rely on a deprecated positional fallback in pandas.
    values = x.to_numpy() if isinstance(x, pd.Series) else np.asarray(x, dtype=object)
    distribution = values[-1]
    # The row is object-typed because it also holds the distribution; cast to float
    features = values[:-1].astype(float)
    # Calculates distance
    inverse_cov = np.linalg.inv(distribution.cov)
    mahalanobis_distance = mahalanobis(features, distribution.mean, inverse_cov)
    return mahalanobis_distance
# Attach the fitted distribution of each (pitcher, pitch type) to every matching pitch
data21_merged2 = data21.merge(pitch_noise_groups, how = 'inner', on = ['player_name','pitch_type'])
# Mahalanobis distance of every pitch's continuous features from the center of its
# own distribution (the distribution must be the last column passed to calc_mahalanobis)
_distance_inputs = data21_merged2[cont_feats + ['MV_Dist']]
data21_merged2['mahalanobis'] = _distance_inputs.apply(calc_mahalanobis, axis = 1)
# The distribution objects are no longer needed once the distances are computed
data21_merged2 = data21_merged2.drop(columns = 'MV_Dist')
data21_merged2
| delta_run_exp | player_name | p_throws | batter | stand | pitch_type | pitch_number | home_team | game_date | game_pk | ... | ax | ay | az | sz_top | sz_bot | inferred_axis | axis_diff | pitch_count | game_week | mahalanobis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038 | Márquez, Germán | R | 605141 | R | FF | 1 | COL | 2021-04-01 | 634615 | ... | -4.967683 | 25.527820 | -18.815090 | 3.46 | 1.76 | 196.542983 | 21.623025 | 1 | 1 | 3.752184 |
| 1 | -0.049 | Márquez, Germán | R | 605141 | R | FF | 2 | COL | 2021-04-01 | 634615 | ... | -4.405496 | 25.441128 | -19.766641 | 3.29 | 1.49 | 195.859366 | -0.859366 | 2 | 1 | 10.473990 |
| 2 | 0.052 | Márquez, Germán | R | 605141 | R | FF | 3 | COL | 2021-04-01 | 634615 | ... | -6.331036 | 25.767684 | -20.177597 | 3.34 | 1.64 | 206.029592 | 1.970408 | 3 | 1 | 2.909655 |
| 3 | 0.113 | Márquez, Germán | R | 605141 | R | FF | 4 | COL | 2021-04-01 | 634615 | ... | -1.810341 | 24.856947 | -23.067890 | 3.29 | 1.58 | 185.042451 | 33.123557 | 4 | 1 | 4.861900 |
| 4 | -0.078 | Márquez, Germán | R | 605141 | R | FF | 5 | COL | 2021-04-01 | 634615 | ... | -3.312501 | 24.833759 | -20.112617 | 3.22 | 1.55 | 191.181754 | 20.818246 | 5 | 1 | 3.393314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 704089 | 0.000 | Domínguez, Seranthony | R | 571918 | R | SL | 5 | MIA | 2021-10-03 | 632246 | ... | 4.254631 | 26.821952 | -30.382886 | 3.37 | 1.53 | 103.781597 | -0.781597 | 7 | 27 | 1.500000 |
| 704090 | -0.114 | Domínguez, Seranthony | R | 571918 | R | SL | 6 | MIA | 2021-10-03 | 632246 | ... | 4.214959 | 25.765611 | -31.283437 | 3.37 | 1.53 | 96.458816 | -22.458816 | 8 | 27 | 1.500000 |
| 704091 | 0.024 | Domínguez, Seranthony | R | 663743 | R | SL | 2 | MIA | 2021-10-03 | 632246 | ... | 7.495288 | 26.630297 | -32.724664 | 3.32 | 1.56 | 81.304497 | 22.695503 | 10 | 27 | 1.500000 |
| 704092 | 0.043 | Domínguez, Seranthony | R | 663743 | R | SL | 5 | MIA | 2021-10-03 | 632246 | ... | 5.347618 | 27.557519 | -30.424209 | 3.28 | 1.56 | 96.766175 | 8.233825 | 13 | 27 | 1.500000 |
| 704093 | 0.038 | Alexander, Tyler | L | 660162 | R | CU | 1 | CWS | 2021-10-03 | 632252 | ... | -1.969161 | 19.192514 | -34.745736 | 3.40 | 1.54 | 317.121096 | -42.121096 | 68 | 27 | 0.000000 |
704094 rows × 40 columns
# Distribution of Mahalanobis distances per pitch type for Zack Wheeler, 2021
name = 'Wheeler, Zack'
wheeler_21 = data21_merged2[data21_merged2['player_name'] == name]
sns.histplot(wheeler_21, x="mahalanobis", y="pitch_type", hue="pitch_type", legend=False)
plt.title('Mahalanobis Distance by Pitch Type, Zack Wheeler 2021')
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Zack Wheeler 2021')
# Distribution of Mahalanobis distances per pitch type for Clayton Kershaw, 2021
name = 'Kershaw, Clayton'
kershaw_21 = data21_merged2[data21_merged2['player_name'] == name]
sns.histplot(kershaw_21, x="mahalanobis", y="pitch_type", hue="pitch_type", legend=False)
plt.title('Mahalanobis Distance by Pitch Type, Clayton Kershaw 2021')
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Clayton Kershaw 2021')
# Distribution of Mahalanobis distances per pitch type for Kenley Jansen, 2021
name = 'Jansen, Kenley'
jansen_21 = data21_merged2[data21_merged2['player_name'] == name]
sns.histplot(jansen_21, x="mahalanobis", y="pitch_type", hue="pitch_type", legend=False)
plt.title('Mahalanobis Distance by Pitch Type, Kenley Jansen 2021')
Text(0.5, 1.0, 'Mahalanobis Distance by Pitch Type, Kenley Jansen 2021')
# Continuous pitch-tracking columns that get standardized before the
# dimensionality reduction.
pca_scale_features = [
    'release_speed', 'release_extension', 'effective_speed',
    'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
    'spin_axis', 'axis_diff', 'inferred_axis', 'pfx_x', 'pfx_z',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
]
# Categorical features are currently left out of the PCA input:
#pca_cat_features = ['p_throws','stand','pitch_number',
# 'home_team','game_week','balls','strikes','outs_when_up']

# Column transformer: one scaler entry per feature group
pca_preprocessor = ColumnTransformer(transformers=[
    # StandardScaler for continuous features
    ('standard_scale', StandardScaler(), pca_scale_features),
    #('categorical', OneHotEncoder(), pca_cat_features) # OneHotEncoder for categorical features
])

# The "PCA" step is implemented with TruncatedSVD, keeping 10 components
pca_alldata = TruncatedSVD(n_components=10, random_state=15)

# Pipeline: scale the selected columns, then reduce them
pca_pipeline_alldata = Pipeline([
    ('preprocessor', pca_preprocessor),
    ('pca', pca_alldata),
])
# Stack both seasons into a single training frame and fit the PCA pipeline
# on it (the transformed output itself is not kept).
all_data = pd.concat([data21, data22])
pca_pipeline_alldata.fit_transform(all_data)

# Bar chart of the explained-variance ratio of each individual component
# (one bar per component, not a cumulative sum).
variance = pca_pipeline_alldata.named_steps['pca'].explained_variance_ratio_
plt.figure(figsize=(10, 5))
plt.bar(x=np.arange(1, len(variance) + 1), height=variance)
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.title('Elbow Plot, PCA Training Data')
plt.grid(True)
plt.show()
def create_scatterplot_subplot(input_data, x_vars, y_vars, hue_var, title=None, inp_ax=None):
    """
    Draw seaborn scatterplot(s) with points coloured by a categorical variable.

    With exactly one x/y pair, a single scatterplot is drawn on ``inp_ax``
    (seaborn falls back to the current axes when ``inp_ax`` is None).
    Otherwise a new figure with a single row of subplots is created, one
    panel per (x, y) pair.

    Parameters:
    input_data (DataFrame): Data holding the columns named in x_vars / y_vars.
    x_vars (list of str): Column names for the x-axes.
    y_vars (list of str): Column names for the y-axes, paired with x_vars by position.
    hue_var (str or array-like): Passed to seaborn as ``hue`` to colour the points.
    title (str, optional): Title for the single-plot case. It is only applied
        when inp_ax is given; without inp_ax the caller sets the title itself.
    inp_ax (matplotlib Axes, optional): Existing axes for the single-plot case.
        Ignored when more than one x/y pair is passed.

    Returns: None
    """
    # Single pair: draw on the supplied (or current) axes and stop
    if len(x_vars) == 1:
        sns.scatterplot(data=input_data, x=x_vars[0], y=y_vars[0],
                        hue=hue_var, palette='Set1', ax=inp_ax)
        if inp_ax is not None:
            inp_ax.set_title(title, fontsize=12)
        return

    # Several pairs: one 4x4-inch panel per pair, laid out in a single row
    num_cols = len(x_vars)
    _, axes = plt.subplots(1, num_cols, figsize=(4 * num_cols, 4))
    for panel, x_var, y_var in zip(axes, x_vars, y_vars):
        sns.scatterplot(data=input_data, x=x_var, y=y_var,
                        hue=hue_var, palette='Set1', ax=panel)
        panel.set_title(f'{x_var} vs {y_var}')
        panel.set_xlabel(x_var)
        panel.set_ylabel(y_var)
def cluster_pitchdata(name, pitch_data, df_out):
    """
    Interactively re-cluster one pitcher's pitches and add the result to df_out.

    The pitcher's rows are standardized, reduced to 4 components with
    TruncatedSVD and plotted. The user is then prompted (via ``input``) for
    the number of Gaussian-mixture clusters. Each fitted cluster is renamed
    to the original pitch-type label whose centroid is nearest, and a
    before/after plot is shown. A second prompt decides whether the new
    labels are kept (answer ``'y'``) or the original labels are restored
    (any other answer).

    NOTE: every cluster is matched independently to its nearest original
    label, so several clusters can receive the same label and some original
    labels may not appear in the clustered result.

    Parameters:
    name (str): Pitcher name, matched against ``pitch_data.player_name``.
    pitch_data (pandas DataFrame): Pitch-level data; must contain
        ``player_name``, ``pitch_type`` and the numeric columns listed in
        ``standardscale_features`` below.
    df_out (pandas DataFrame): Results accumulated from earlier calls.
        It is not modified in place.

    Returns: a new DataFrame holding the rows of df_out followed by this
    pitcher's rows, with ``pitch_type`` (clustered or original) and
    ``player_name`` as the last two columns.
    """
    # Rows for this pitcher; the original pitch labels are kept aside as ground truth
    df = pitch_data[pitch_data.player_name == name].reset_index(drop=True)
    df_hue = df.pitch_type.reset_index(drop=True)
    df = df.drop(['player_name', 'pitch_type'], axis=1)
    #,'batter','game_pk','game_date'

    # Continuous features that are standardized before the reduction
    standardscale_features = [
        'release_speed', 'release_extension', 'effective_speed',
        'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
        'spin_axis', 'inferred_axis', 'axis_diff', 'pfx_x', 'pfx_z',
        'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
    ]
    preprocessor = ColumnTransformer(
        transformers=[
            ('standard_scale', StandardScaler(), standardscale_features),
        ])

    # Reduce to 4 components (TruncatedSVD plays the role of PCA here)
    pca = TruncatedSVD(n_components=4, random_state=15)
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', pca),
    ])
    df_pca = pd.DataFrame(pipeline.fit_transform(df),
                          columns=['PC1', 'PC2', 'PC3', 'PC4'])

    # Show the first two components so the user can judge how many clusters to ask for
    xpca_vars = ['PC1']  # ['PC1','PC2','PC3','PC4']
    ypca_vars = ['PC2']  # ['PC2','PC3','PC4','PC1']
    plt.figure(figsize=(4, 4))
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df_hue)
    plt.title(f'PCA: {name}')
    plt.show()
    print('Before Clustering:',
          ', '.join(f"{index}: {count}" for index, count in df_hue.value_counts().items()))

    # Ask for the number of clusters; re-prompt on a typo instead of aborting
    # the whole manual labelling session with a ValueError.
    while True:
        raw_count = input('Specify Number of Clusters: ')
        try:
            num_clusters = int(raw_count)
        except ValueError:
            print(f'Not a whole number: {raw_count!r}')
            continue
        if num_clusters >= 1:
            break
        print('Number of clusters must be at least 1')

    # Fit a Gaussian Mixture Model to the reduced data
    gmm = GaussianMixture(n_components=num_clusters, n_init=30, random_state=42,
                          max_iter=500, tol=1e-8, init_params='k-means++')
    cluster_labels = gmm.fit_predict(df_pca)
    df['pitch_type'] = cluster_labels
    cluster_centroids = gmm.means_

    # Centroid (mean of the 4 components) of every original pitch-type label
    ground_truth_labels = np.unique(df_hue)
    hue_values = np.array(df_hue)
    ground_truth_centroids = np.vstack([
        df_pca[hue_values == label].mean(axis=0).to_numpy()
        for label in ground_truth_labels
    ])

    # Nearest original-label centroid for each cluster centroid, measured with a
    # Mahalanobis distance built from the covariance of the cluster centroids.
    # The identity matrix is added before inverting (presumably to keep the
    # matrix well-conditioned -- TODO confirm intent).
    inverse_cov = np.linalg.inv(
        np.cov(cluster_centroids.T) + np.identity(cluster_centroids.shape[1]))
    cluster_to_ground_truth_mapping, _ = pairwise_distances_argmin_min(
        X=cluster_centroids, Y=ground_truth_centroids,
        metric='mahalanobis', metric_kwargs={'VI': inverse_cov})

    # Cluster index (0..k-1) -> original label of the nearest centroid
    mapping_dict = {
        cluster: ground_truth_labels[nearest]
        for cluster, nearest in enumerate(cluster_to_ground_truth_mapping)
    }
    df['pitch_type'] = df['pitch_type'].map(mapping_dict)

    print('After Clustering: ',
          ', '.join(f"{index}: {count}"
                    for index, count in df['pitch_type'].value_counts().items()))

    # Side-by-side view of the original and the clustered labelling
    _, axis = plt.subplots(1, 2, figsize=(8, 4))
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df_hue,
                               title=f'PCA: {name}', inp_ax=axis[0])
    create_scatterplot_subplot(df_pca, xpca_vars, ypca_vars, df['pitch_type'],
                               title=f'PCA (Clustered): {name}', inp_ax=axis[1])
    plt.show()

    # Manual accept/reject: anything other than 'y' restores the original labels
    cluster_eval = input('Is this clustering sufficient?: ')
    if cluster_eval != 'y':
        df['pitch_type'] = df_hue

    df['player_name'] = name
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    return pd.concat([df_out, df])
#data21_clus = pd.read_csv('Clustering2021.csv')
#data22_clus = pd.read_csv('Clustering2022.csv')
new_clus = pd.DataFrame()

# Manual review resumes at this position in the list of unique 2022 pitchers.
start_index = 785
# Compute the unique names once instead of on every iteration; enumerate
# yields the same position that was previously looked up with np.where.
pitchers_22 = data22.player_name.unique()
for index, i in enumerate(pitchers_22[start_index:], start=start_index):
    print(index, i)
    new_clus = cluster_pitchdata(i, data22, new_clus)
785 Ortiz, Luis L.
Before Clustering: FF: 118, SL: 99, SI: 41, CH: 14 Specify Number of Clusters: 4 After Clustering: FF: 159, SL: 99, CH: 14
Is this clustering sufficient?: 786 Espinal, Raynel
Before Clustering: FF: 29, CH: 29, SL: 17 Specify Number of Clusters: 3 After Clustering: FF: 29, CH: 29, SL: 17
Is this clustering sufficient?: y 787 Dowdy, Kyle
Before Clustering: CU: 38, FF: 36, FC: 21 Specify Number of Clusters: 3 After Clustering: CU: 38, FF: 36, FC: 21
Is this clustering sufficient?: y 788 Wiles, Collin
Before Clustering: FC: 63, SI: 27, FF: 24, CU: 19, SL: 15, CH: 5 Specify Number of Clusters: 6 After Clustering: FC: 63, SI: 56, SL: 24, CU: 10
Is this clustering sufficient?: 789 Jameson, Drey
Before Clustering: FF: 127, SI: 103, SL: 89, CH: 31, CU: 16 Specify Number of Clusters: 5 After Clustering: FF: 125, SL: 111, SI: 93, CH: 37
Is this clustering sufficient?: 790 Alzolay, Adbert
Before Clustering: FF: 54, SL: 46, FC: 22, SI: 18, CH: 10 Specify Number of Clusters: 6 After Clustering: FF: 72, SL: 45, FC: 23, CH: 10
Is this clustering sufficient?: 791 German, Frank
Before Clustering: FF: 58, SL: 24, CH: 2 Specify Number of Clusters: 3 After Clustering: FF: 58, SL: 26
Is this clustering sufficient?: 792 Hollowell, Gavin
Before Clustering: SL: 47, SI: 38, FF: 32, CH: 1 Specify Number of Clusters: 4 After Clustering: SL: 48, SI: 37, FF: 33
Is this clustering sufficient?: 793 Henriquez, Ronny
Before Clustering: SL: 82, CH: 53, FF: 41, SI: 3 Specify Number of Clusters: 4 After Clustering: SL: 81, CH: 54, FF: 44
Is this clustering sufficient?: t 794 Miller, Shelby
Before Clustering: SL: 48, FF: 41 Specify Number of Clusters: 2 After Clustering: SL: 48, FF: 41
Is this clustering sufficient?: y 795 Britton, Zack
Before Clustering: SI: 32, SL: 7 Specify Number of Clusters: 2 After Clustering: SI: 32, SL: 7
Is this clustering sufficient?: y 796 Glasnow, Tyler
Before Clustering: FF: 24, SL: 14, CU: 10 Specify Number of Clusters: 3 After Clustering: FF: 24, SL: 24
Is this clustering sufficient?: 797 Díaz, Miguel
Before Clustering: CH: 9, FF: 4, SI: 3, SL: 2 Specify Number of Clusters: 4 After Clustering: CH: 11, FF: 5, SL: 2
Is this clustering sufficient?: 798 Woods Richardson, Simeon
Before Clustering: FF: 50, SL: 17, CU: 13, CH: 11 Specify Number of Clusters: 4 After Clustering: FF: 50, SL: 25, CH: 12, CU: 4
Is this clustering sufficient?: 799 McGee, Easton
Before Clustering: SL: 19, SI: 15, FC: 6, CH: 3, CU: 2, FF: 1 Specify Number of Clusters: 6 After Clustering: SL: 18, SI: 17, CH: 7, CU: 3, FC: 1
Is this clustering sufficient?:
# Stack the newly reviewed pitchers under the existing 2022 clustering table
# and write the combined frame to 'Clustering2022.csv' without the index.
# NOTE(review): in the visible code data22_clus is only assigned by a
# commented-out read_csv call -- confirm it is loaded before this runs.
clus22 = pd.concat(objs=[data22_clus, new_clus])
clus22.to_csv(path_or_buf='Clustering2022.csv', index=False)